In [8]:
import numpy as np 
import pandas as pd 

# from subprocess import check_output
# print(check_output(["ls", "../data/"]).decode("utf8"))

In [9]:
train = pd.read_csv('../data/train.csv',
                    dtype={'is_booking':bool,'srch_destination_id':np.int32, 'hotel_cluster':np.int32},
                    usecols=['srch_destination_id','is_booking','hotel_cluster'],
                    chunksize=1000000)

# Setting explicit dtypes (bool, np.int32) and reading in chunks via chunksize makes the data much faster to process!! (see the memory sketch after Out[9] below)


aggs = []
print('-'*38)
for chunk in train:
    agg = chunk.groupby(['srch_destination_id',
                         'hotel_cluster'])['is_booking'].agg(['sum','count'])
    agg.reset_index(inplace=True)
    aggs.append(agg)
    print('.',end='')
print('')
aggs = pd.concat(aggs, axis=0)


--------------------------------------
......................................
Out[9]:
   srch_destination_id  hotel_cluster  sum  count
0                    1             20  0.0      2
1                    1             30  0.0      1
2                    1             60  0.0      2
3                    4             22  1.0      2
4                    4             25  1.0      2
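
The chunked read above relies on the dtype hints to stay light: an int32 column takes half the memory of pandas' default int64, so each million-row chunk is cheaper to load and aggregate. A minimal sketch of that difference on a hypothetical toy frame (not the competition data):

import numpy as np
import pandas as pd

ids = pd.DataFrame({'srch_destination_id': np.arange(1_000_000, dtype=np.int64)})
ids32 = ids.astype({'srch_destination_id': np.int32})

print(ids.memory_usage(deep=True).sum())    # ~8 MB with the default int64
print(ids32.memory_usage(deep=True).sum())  # ~4 MB with np.int32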

In [14]:
CLICK_WEIGHT = 0.05
agg = aggs.groupby(['srch_destination_id','hotel_cluster']).sum().reset_index()
agg.head()


Out[14]:
   srch_destination_id  hotel_cluster  sum  count
0                    0              3  0.0      2
1                    1             20  4.0     26
2                    1             30  2.0     22
3                    1             57  0.0      1
4                    1             60  0.0     17

In [15]:
agg['count'] -= agg['sum']
# 'sum' counts actual bookings (is_booking is boolean), while 'count' counts every row (bookings + clicks), so subtracting leaves clicks only
agg = agg.rename(columns={'sum':'bookings','count':'clicks'})
agg['relevance'] = agg['bookings'] + CLICK_WEIGHT * agg['clicks']
agg.head()


Out[15]:
   srch_destination_id  hotel_cluster  bookings  clicks  relevance
0                    0              3       0.0     2.0       0.10
1                    1             20       4.0    22.0       5.10
2                    1             30       2.0    20.0       3.00
3                    1             57       0.0     1.0       0.05
4                    1             60       0.0    17.0       0.85
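
A quick sanity check of the relevance formula against the second row above (srch_destination_id 1, hotel_cluster 20):

bookings, clicks = 4.0, 22.0
relevance_check = bookings + CLICK_WEIGHT * clicks  # 4.0 + 0.05 * 22.0
print(relevance_check)  # 5.1, matching the relevance column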

In [16]:
def most_popular(group, n_max=5):
    relevance = group['relevance'].values
    hotel_cluster = group['hotel_cluster'].values
    most_popular = hotel_cluster[np.argsort(relevance)[::-1]][:n_max]
    return np.array_str(most_popular)[1:-1] # remove square brackets
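
To see what most_popular returns, here is a small hypothetical group (not taken from the data); the clusters come back as a space-separated string, ordered by descending relevance:

toy = pd.DataFrame({'hotel_cluster': [3, 17, 42],
                    'relevance':     [0.2, 5.0, 1.3]})
print(most_popular(toy))  # '17 42  3' -- cluster 17 has the highest relevance, then 42, then 3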

In [17]:
%%time
most_pop = agg.groupby(['srch_destination_id']).apply(most_popular)
most_pop = pd.DataFrame(most_pop).rename(columns={0:'hotel_cluster'})
most_pop.head()


Wall time: 18.9 s

In [21]:
%%time
test = pd.read_csv('../data/test.csv',
                    dtype={'srch_destination_id':np.int32},
                    usecols=['srch_destination_id'],)


Wall time: 1.88 s

In [22]:
test.head()


Out[22]:
   srch_destination_id
0                12243
1                14474
2                11353
3                 8250
4                11812

In [23]:
test = test.merge(most_pop, how='left',left_on='srch_destination_id',right_index=True)
test.head()


Out[23]:
   srch_destination_id   hotel_cluster
0                12243   5 55 37 11 22
1                14474               5
2                11353   0 31 77 91 96
3                 8250   1 45 79 24 54
4                11812   91 42 2 48 59
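
The left merge keeps every test row and looks up most_pop by its index (srch_destination_id); destinations never seen in train come back as NaN. A minimal sketch of the same lookup pattern, reusing a couple of ids and cluster strings from the outputs above:

lookup = pd.DataFrame({'hotel_cluster': ['5 55 37 11 22', '91 42 2 48 59']},
                      index=pd.Index([12243, 11812], name='srch_destination_id'))
queries = pd.DataFrame({'srch_destination_id': [12243, 65671]})
print(queries.merge(lookup, how='left',
                    left_on='srch_destination_id', right_index=True))
# 12243 gets its cluster string; 65671 is not in the lookup index, so hotel_cluster is NaN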

In [32]:
test[test["hotel_cluster"].isnull()].head()


Out[32]:
     srch_destination_id hotel_cluster
286                65671           NaN
357                13679           NaN
445                44373           NaN
458                65106           NaN
627                51983           NaN

In [24]:
test["hotel_cluster"].isnull().sum()


Out[24]:
14036

In [33]:
# For destinations that never appear in train, just recommend the overall most popular clusters

most_pop_all = agg.groupby('hotel_cluster')['relevance'].sum().nlargest(5).index
# nlargest(5) -> keeps the 5 clusters with the largest summed relevance (sketched after the output below)
most_pop_all = np.array_str(most_pop_all)[1:-1]
most_pop_all


Out[33]:
'91 48 42 59 28'
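
nlargest(5) keeps the five clusters with the highest summed relevance and returns their index labels; a tiny sketch of the same pattern with hypothetical relevance totals:

s = pd.Series({91: 10.0, 48: 8.5, 42: 7.0, 59: 6.5, 28: 6.0, 3: 0.1})
print(s.nlargest(5).index.tolist())  # [91, 48, 42, 59, 28] -- labels of the 5 largest values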

In [34]:
test["hotel_cluster"].fillna(most_pop_all,inplace=True)

In [35]:
test.head()


Out[35]:
   srch_destination_id   hotel_cluster
0                12243   5 55 37 11 22
1                14474               5
2                11353   0 31 77 91 96
3                 8250   1 45 79 24 54
4                11812   91 42 2 48 59

In [37]:
%%time
test["hotel_cluster"].to_csv('predicted_with_pandas.csv',header=True, index_label='id')


Wall time: 6.01 s

Public score : 0.30340


In [ ]: